##############
# This script compiles the dataset with all ads published on Job-Room between January 2020 and May 2021
# Author: Jeremias Kläui
#############

# We agreed with SECO to run this code only during the night

library(data.table)
library(stringr)
library(lubridate)
library(httr)
library(jsonlite)
library(readxl)
library(tictoc)

# NOTE: the workspace is deliberately not wiped with rm(list = ls()); every
# object used below is (re)created by this script itself.

# Run from the script's own folder so that the relative paths below resolve.
# rstudioapi only works inside a running RStudio session; anywhere else (e.g. a
# nightly Rscript run) the current working directory is used as it is.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getSourceEditorContext()$path))
}
getwd()

# Folder that receives one .rds file per crawled chunk (keep the trailing slash,
# file names are appended with paste0)
store <- "Misc_files/jobroom_ads/"

# We have to download the content of the ads from the Job-Room API
url <- "https://www.job-room.ch/jobadservice/api/jobAdvertisements/"

# Range of ad numbers (row positions in the combined hash list) to crawl.
# Raise start_nr to continue an interrupted run.
start_nr <- 1
end_nr <- 1173434

# Read the ad-hashes from all ads published on the platform between January 2020
# and May 2021 (these lists must be requested from the Swiss State Secretariat
# of Economic Affairs)
hash_files <- c(
  "data_raw/kof-ids_20200101-20200630.csv",
  "data_raw/kof-ids_20200701-20201231.csv",
  "data_raw/kof-ids_20210101-20210531.csv"
)
hashs <- rbindlist(lapply(hash_files, fread), use.names = TRUE)

# Split into chunks of 2000 ads.
# NOTE: with floor(n / 2000) + 1 the first chunk holds ads 1..1999 and every
# later chunk starts at a multiple of 2000. The formula is left untouched on
# purpose: the chunk borders end up in the names of the stored files.
hashs[, n := 1:.N]
hashs[, chunk := floor(n / 2000) + 1]
setorder(hashs, chunk)
hashs[, .N, by = chunk]

# Define the ads we want
do_hashs <- hashs[n >= start_nr & n <= end_nr, ]

chunks <- split(do_hashs, by = "chunk")

# Rough estimate of the total runtime in hours. `sleep` is the assumed cost per
# ad in seconds (the crawling function itself pauses 0.01 s before each
# request); `sleep_after_chunk` is the pause in seconds after each chunk.
sleep <- 0.02
sleep_after_chunk <- 10
(nrow(do_hashs) * sleep + sleep_after_chunk * length(chunks)) / 3600
  
# Define the crawling function --------------------------------------------
# Download the content of one job ad from the Job-Room API.
#
# A failed attempt (connection error, HTTP error status, or a response whose id
# differs from the requested one) is reported on the console and repeated after
# a pause.
#
# Arguments:
#   ad        ad hash; it is appended to `base_url` to build the request URL
#   base_url  API endpoint, defaults to the global `url` defined above
#   wait      pause in seconds before the next attempt (default: 15 minutes)
#   max_tries maximum number of attempts; the default Inf retries forever,
#             which is what the crawler has always done
#
# Returns the parsed response (a list) of the requested ad. Stops with an error
# only when `max_tries` attempts have failed.
get_job_content_from_url <- function(ad, base_url = url, wait = 60 * 15,
                                     max_tries = Inf) {
  attempt <- 0

  # Retry in a loop rather than by recursion, so that a long outage does not
  # pile up nested calls and tryCatch frames
  repeat {
    attempt <- attempt + 1
    Sys.sleep(.01)

    # Reset before every attempt: the error handler may only print a response
    # that really belongs to this attempt (GET itself can fail before `resp`
    # is assigned)
    resp <- NULL

    ad_content <- tryCatch(
      {
        resp <- GET(paste0(base_url, ad))

        if (http_error(resp)) {
          stop("HTTP error (status ", status_code(resp), ")", call. = FALSE)
        }

        parsed <- content(resp, as = "parsed")

        # identical() also copes with a missing id, where `==` would hand a
        # zero-length condition to if()
        if (!identical(as.character(parsed$id), as.character(ad))) {
          stop("Error: ids not equal", call. = FALSE)
        }

        parsed
      },
      error = function(err) {
        print(err)
        if (!is.null(resp)) {
          print(resp)
        }
        NULL
      }
    )

    if (!is.null(ad_content)) {
      return(ad_content)
    }

    if (attempt >= max_tries) {
      stop("Giving up on ad ", ad, " after ", attempt, " attempts",
           call. = FALSE)
    }

    print(paste0("Waiting ", wait / 60, " min and trying again"))
    Sys.sleep(wait)
  }
}


# run the crawler over the chunks ------------------------

# Make sure the output folder exists before any request is sent; otherwise the
# first saveRDS() would fail only after a whole chunk has been downloaded
dir.create(store, showWarnings = FALSE, recursive = TRUE)

# Crawl chunk by chunk and store every chunk in its own .rds file, named after
# the first and last ad number it contains
for (chunk_dt in chunks) {
  name <- as.character(chunk_dt[, paste0(min(n), "_", max(n))])
  print(paste(Sys.time(), " CHUNK:", name))

  # The extra list() layer is part of the stored format: the processing code
  # further down reads the ads of a chunk as chunk[[1]]
  chunk_contents <- list(lapply(chunk_dt[, id], get_job_content_from_url))
  saveRDS(chunk_contents, file = paste0(store, make.names(paste0("chunk_", name)), ".rds"))

  # Pause between chunks, using the same setting as the runtime estimate above
  Sys.sleep(sleep_after_chunk)
}


# Process the information into a data table -------------------------------
  
# Pick up the stored chunk files. The pattern is a regular expression, so the
# dot is escaped and the match is anchored to the end of the file name;
# a bare ".rds" would also match names that merely contain "rds" somewhere.
files <- list.files(store, pattern = "\\.rds$")

# Load every chunk; printing the file name shows the progress
chunks_read <- lapply(files, function(x) {
  print(x)
  readRDS(paste0(store, x))
})

# A single parsed ad (the 15th of the first chunk), apparently kept at hand for
# inspecting the structure of a response interactively
ad <- chunks_read[[1]][[1]][[15]]

# Choose which variables we want and flatten every ad into a one-row table.
# Note: more than one AVAM occupation code per job is possible; occupations are
# therefore handled in a separate table further down.

# Replace a missing (NULL) field by NA so that its column is always present
null_to_na <- function(x) {
  if (is.null(x)) NA else x
}

tic()
important <- lapply(chunks_read, function(chunk) {
  # chunk[[1]] holds the list of ads of one stored chunk
  lapply(chunk[[1]], function(ad) {
    job <- ad$jobContent

    # Title and language are taken from the first job description only; an ad
    # without any description yields NULL here instead of a subscript error
    descriptions <- job$jobDescriptions
    first_description <- if (length(descriptions) > 0) descriptions[[1]] else NULL

    # Fields that are NULL simply produce no column for this ad; they are
    # filled with NA when the rows are bound together below (fill = TRUE)
    as.data.table(list(
      "id" = ad$id,
      "source" = ad$sourceSystem,
      "external_url" = null_to_na(job$externalUrl),
      "external_reference" = null_to_na(ad$externalReference),
      "job_title" = first_description$title,
      "job_title_language" = first_description$languageIsoCode,
      "number_languages" = length(descriptions),
      "city" = job$location$city,
      "postcode" = job$location$postalCode,
      "communalcode" = job$location$communalCode,
      "created" = ad$createdTime,
      "publication_start_date" = ad$publication$startDate,
      "publication_end_date" = ad$publication$endDate,
      "company_name" = job$company$name,
      "company_street" = job$company$street,
      "company_housenumber" = job$company$houseNumber,
      "company_plz" = job$company$postalCode,
      "company_country" = job$company$countryIsoCode,
      "start_date" = job$employment$startDate,
      "end_date" = job$employment$endDate,
      "short_employment" = job$employment$shortEmployment,
      "immediate_start" = job$employment$immediately,
      "workload_min" = job$employment$workloadPercentageMin,
      "workload_max" = job$employment$workloadPercentageMax,
      "permanent" = job$employment$permanent
    ))
  })
})

# Bind the ads of each chunk first, then all chunks into one table
jr_ads_info <- lapply(important, function(chunk) rbindlist(chunk, fill = TRUE))
jr_ads_info <- rbindlist(jr_ads_info, use.names = TRUE, fill = TRUE)
setnames(jr_ads_info, "id", "ad_hash")
toc()

# Tidy up the column types: categorical fields become factors and the
# timestamp / date strings become Date values
factor_cols <- c("source", "company_country")
date_cols <- c("created", "publication_start_date", "publication_end_date")

jr_ads_info[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
jr_ads_info[, (date_cols) := lapply(.SD, as.Date), .SDcols = date_cols]

# Write the ad-level dataset to disk
fwrite(jr_ads_info, "data_processed/job_ads_from_api.csv", row.names = FALSE)


# Occupations -----------------------------------------------

# Note: an ad can carry more than one AVAM occupation, so the occupation codes
# are collected in their own table instead of the ad-level table.

# Build the (id, avam_code) table of a single ad
extract_avam_codes <- function(ad) {
  codes <- lapply(ad$jobContent$occupations, function(occupation) {
    occupation$avamOccupationCode
  })
  as.data.table(list("id" = ad$id, "avam_code" = codes))
}

# chunk[[1]] holds the list of ads of one stored chunk
occupations <- lapply(chunks_read, function(chunk) {
  lapply(chunk[[1]], extract_avam_codes)
})

# Bind the ads of each chunk first, then all chunks into one table
ad_occupations <- rbindlist(
  lapply(occupations, function(chunk) rbindlist(chunk, fill = TRUE)),
  use.names = TRUE
)
setnames(ad_occupations, c("id", "avam_code"), c("ad_hash", "cod_avam"))

ad_occupations[, cod_avam := as.integer(cod_avam)]

# Distribution of the number of occupations per ad (few ads have more than one)
table(ad_occupations[, .N, by = ad_hash]$N)

### merge to isco #####
occ_mapping <- as.data.table(read_excel("Misc_files/AVAM_ISCO.xlsx"))

# Make the Excel headers syntactically valid column names
names(occ_mapping) <- make.names(names(occ_mapping))
setnames(occ_mapping, "AVAM.BN_NEU", "cod_avam")

# Convert to the international isco4: dividing by 10 and rounding down drops
# the last digit of the code in the mapping file
occ_mapping[, isco := floor(NR_BERUFSART_CH.ISCO_NEU / 10)]

# Is the mapping unambiguous, i.e. exactly one isco per AVAM code?
table(occ_mapping[, uniqueN(isco), by = cod_avam]$V1)    # Yes (original author's check)

# Merge. The lookup is reduced to distinct (isco, cod_avam) pairs first: if the
# mapping file holds several rows for one AVAM code, merging against the raw
# table would silently duplicate the matching ad rows.
isco_lookup <- unique(occ_mapping[, .(isco, cod_avam)])
ad_occupations <- merge(ad_occupations, isco_lookup, by = "cod_avam", all.x = TRUE)

# An isco of 0 is treated as missing
ad_occupations[isco == 0, isco := NA]

# Check match: share (in %) of rows without an isco code
ad_occupations[, sum(is.na(isco)) / .N * 100] # 1.6% reported by the original author

fwrite(ad_occupations, "data_processed/job_ads_from_api_occupation.csv")
